library(data.table)
library(dplyr)
library(tidyverse)
library(dtplyr)
library(knitr)
library(plotly)
library(widgetframe)

Lab 11

Load both datasets in

# load COVID state-level data from NYT
cv_states <- as.data.frame(data.table::fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"))

# load state population data
state_pops <- as.data.frame(data.table::fread("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv"))
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL
# Merge data sets
cv_states <- merge(cv_states, state_pops, by="state")

2. Look at the data

  • Inspect the dimensions, head, and tail of the data
  • Inspect the structure of each variables. Are they in the correct format?
dim(cv_states)
## [1] 51178     9
head(cv_states)
##     state       date fips   cases deaths geo_id population pop_density abb
## 1 Alabama 2020-11-02    1  194892   2973      1    4887871    96.50939  AL
## 2 Alabama 2021-07-17    1  559478  11443      1    4887871    96.50939  AL
## 3 Alabama 2021-07-06    1  551298  11358      1    4887871    96.50939  AL
## 4 Alabama 2022-02-24    1 1276580  18102      1    4887871    96.50939  AL
## 5 Alabama 2020-08-01    1   89349   1603      1    4887871    96.50939  AL
## 6 Alabama 2022-05-07    1 1302397  19601      1    4887871    96.50939  AL
tail(cv_states)
##         state       date fips  cases deaths geo_id population pop_density abb
## 51173 Wyoming 2021-12-05   56 111812   1428     56     577737    5.950611  WY
## 51174 Wyoming 2022-06-11   56 159707   1824     56     577737    5.950611  WY
## 51175 Wyoming 2022-11-01   56 178866   1914     56     577737    5.950611  WY
## 51176 Wyoming 2021-12-31   56 115638   1526     56     577737    5.950611  WY
## 51177 Wyoming 2021-10-11   56  95137   1041     56     577737    5.950611  WY
## 51178 Wyoming 2021-10-19   56  98567   1136     56     577737    5.950611  WY
str(cv_states)
## 'data.frame':    51178 obs. of  9 variables:
##  $ state      : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ date       : IDate, format: "2020-11-02" "2021-07-17" ...
##  $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ cases      : int  194892 559478 551298 1276580 89349 1302397 1524549 1259580 547135 546540 ...
##  $ deaths     : int  2973 11443 11358 18102 1603 19601 20423 17505 11252 11220 ...
##  $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
##  $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
##  $ abb        : chr  "AL" "AL" "AL" "AL" ...

3. Format the data

# format the date
cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")
# format the state and state abbreviation (abb) variables
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)
# order the data first by state, second by date
cv_states = cv_states[order(cv_states$state, cv_states$date),]
# Confirm the variables are now correctly formatted
str(cv_states)
## 'data.frame':    51178 obs. of  9 variables:
##  $ state      : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ date       : Date, format: "2020-03-13" "2020-03-14" ...
##  $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ cases      : int  6 12 23 29 39 51 78 106 131 157 ...
##  $ deaths     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
##  $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
##  $ abb        : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
head(cv_states)
##       state       date fips cases deaths geo_id population pop_density abb
## 769 Alabama 2020-03-13    1     6      0      1    4887871    96.50939  AL
## 574 Alabama 2020-03-14    1    12      0      1    4887871    96.50939  AL
## 830 Alabama 2020-03-15    1    23      0      1    4887871    96.50939  AL
## 315 Alabama 2020-03-16    1    29      0      1    4887871    96.50939  AL
## 612 Alabama 2020-03-17    1    39      0      1    4887871    96.50939  AL
## 421 Alabama 2020-03-18    1    51      0      1    4887871    96.50939  AL
tail(cv_states)
##         state       date fips  cases deaths geo_id population pop_density abb
## 50415 Wyoming 2022-11-05   56 178866   1914     56     577737    5.950611  WY
## 50836 Wyoming 2022-11-06   56 178866   1914     56     577737    5.950611  WY
## 50530 Wyoming 2022-11-07   56 178866   1914     56     577737    5.950611  WY
## 51159 Wyoming 2022-11-08   56 179366   1917     56     577737    5.950611  WY
## 51162 Wyoming 2022-11-09   56 179366   1917     56     577737    5.950611  WY
## 50939 Wyoming 2022-11-10   56 179366   1917     56     577737    5.950611  WY
# Inspect range of values for each variable. What is the date range? The range of cases and deaths?
head(cv_states)
##       state       date fips cases deaths geo_id population pop_density abb
## 769 Alabama 2020-03-13    1     6      0      1    4887871    96.50939  AL
## 574 Alabama 2020-03-14    1    12      0      1    4887871    96.50939  AL
## 830 Alabama 2020-03-15    1    23      0      1    4887871    96.50939  AL
## 315 Alabama 2020-03-16    1    29      0      1    4887871    96.50939  AL
## 612 Alabama 2020-03-17    1    39      0      1    4887871    96.50939  AL
## 421 Alabama 2020-03-18    1    51      0      1    4887871    96.50939  AL
summary(cv_states)
##            state            date                 fips           cases         
##  Washington   : 1025   Min.   :2020-01-21   Min.   : 1.00   Min.   :       1  
##  Illinois     : 1022   1st Qu.:2020-11-02   1st Qu.:16.00   1st Qu.:   89020  
##  California   : 1021   Median :2021-07-06   Median :29.00   Median :  342322  
##  Arizona      : 1020   Mean   :2021-07-06   Mean   :29.78   Mean   :  813572  
##  Massachusetts: 1014   3rd Qu.:2022-03-09   3rd Qu.:44.00   3rd Qu.:  957840  
##  Wisconsin    : 1010   Max.   :2022-11-10   Max.   :72.00   Max.   :11411718  
##  (Other)      :45066                                                          
##      deaths          geo_id        population        pop_density       
##  Min.   :    0   Min.   : 1.00   Min.   :  577737   Min.   :    1.292  
##  1st Qu.: 1367   1st Qu.:16.00   1st Qu.: 1805832   1st Qu.:   43.659  
##  Median : 5114   Median :29.00   Median : 4468402   Median :  107.860  
##  Mean   :11371   Mean   :29.78   Mean   : 6404024   Mean   :  422.944  
##  3rd Qu.:14391   3rd Qu.:44.00   3rd Qu.: 7535591   3rd Qu.:  229.511  
##  Max.   :97142   Max.   :72.00   Max.   :39557045   Max.   :11490.120  
##                                                     NA's   :973        
##       abb       
##  WA     : 1025  
##  IL     : 1022  
##  CA     : 1021  
##  AZ     : 1020  
##  MA     : 1014  
##  WI     : 1010  
##  (Other):45066
min(cv_states$date)
## [1] "2020-01-21"
max(cv_states$date)
## [1] "2022-11-10"
# Add variables for new_cases and new_deaths


for (i in 1:length(state_list)) {
  cv_subset = subset(cv_states, state == state_list[i])
  cv_subset = cv_subset[order(cv_subset$date),]
  
  # add starting level for new cases and deaths
  cv_subset$new_cases = cv_subset$cases[1]
  cv_subset$new_deaths = cv_subset$deaths[1]
  

  for (j in 2:nrow(cv_subset)) {
    cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$cases[j-1]
    cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$deaths[j-1]
  }
  
  # include in main dataset
  cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases
  cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths
}
# Focus on recent dates
cv_states <- cv_states %>% dplyr::filter(date >= "2022-06-01")
p1<-ggplot(cv_states, aes(x = date, y = new_cases, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p1)
p1<-NULL # to clear from workspace
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2<-NULL # to clear from workspace
# set negative new case or death counts to 0
cv_states$new_cases[cv_states$new_cases<0] = 0
cv_states$new_deaths[cv_states$new_deaths<0] = 0
# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`

for (i in 1:length(state_list)) {
  cv_subset = subset(cv_states, state == state_list[i])
  
  # add starting level for new cases and deaths
  cv_subset$cases = cv_subset$cases[1]
  cv_subset$deaths = cv_subset$deaths[1]
  
  ### FINISH CODE HERE
  for (j in 2:nrow(cv_subset)) {
    cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$cases[j-1]
    cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$deaths[j-1]
  }
  
  # include in main dataset
  cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases
  cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths
}
library(zoo)

# Smooth new counts
cv_states$new_cases = zoo::rollmean(cv_states$new_cases, k=7, fill=NA, align='right') %>% round(digits = 0)

cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k=7, fill=NA, align='right') %>% round(digits = 0)
# Inspect data again interactively
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2=NULL

5. Add additional variables

# add population normalized (by 100,000) counts for each variable
cv_states$per100k =  as.numeric(format(round(cv_states$cases/(cv_states$population/100000),1),nsmall=1))
cv_states$newper100k =  as.numeric(format(round(cv_states$new_cases/(cv_states$population/100000),1),nsmall=1))
cv_states$deathsper100k =  as.numeric(format(round(cv_states$deaths/(cv_states$population/100000),1),nsmall=1))
cv_states$newdeathsper100k =  as.numeric(format(round(cv_states$new_deaths/(cv_states$population/100000),1),nsmall=1))
# add a naive_CFR variable = deaths / cases
cv_states = cv_states %>% mutate(naive_CFR = round((deaths*100/cases),2))
# create a `cv_states_today` dataset
cv_states_today = subset(cv_states, date==max(cv_states$date))

II. Scatterplots

6. Explore scatterplots using plot_ly()

# pop_density vs. cases
cv_states_today %>% 
  plot_ly(x = ~pop_density, y = ~cases, 
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))